Introduction

The Species Distribution Model (SDM) Benchmark Study Part 2 presents data from eight bird species across Colorado, Oregon, North Carolina, and Vermont, using environmental rasters from Part 1. The analysis segments data distribution by state and species and evaluates explanatory rasters using methods like Principal Component Analysis and Factor Analysis for Mixed Data. Additionally, the study addresses pseudo-absence generation and conducts feature engineering/selection to prepare for subsequent modeling.

Visualize Observations

The initial phase of the exploratory data analysis visualizes bird species observations across four U.S. states: Oregon, Vermont, Colorado, and North Carolina, covering the period from 2016 to 2019. These visual representations display the distribution of eight distinct bird species within each state while also highlighting major cities for geographical context. Such visualizations offer insights into spatial distribution patterns, potentially revealing habitat preferences, migration routes, or the influence of urban areas on these distributions. As the analysis progresses, understanding these distributions becomes pivotal, especially when addressing spatial autocorrelations and other features that influence the accuracy and validity of the Species Distribution Model (SDM) benchmarks.

data(us.cities)

# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
  out <- us.cities %>% 
  filter(country.etc==s) %>%
  mutate(city = gsub(paste0(" ", s), "", name)) %>%
  arrange(-pop)
  if (s == "OR") {
    out <- out %>% 
      head() %>%
      filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
                           "Beaverton", "Springfield")))
  } else if (s == "CO") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
  } else if (s == "NC") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
  } else {
    out <- out %>% head()
  }
  out
})

# Load the map data
states <- map_data("state") %>% 
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))

# Load your data
data.files <- list.files("data/final", full.names = T)

df <- purrr::map_df(data.files, readRDS) 

caps.after.ws <- function(string) {
  gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}

# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  T ~ "")
  
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec), 
                             "Observations, 2016-2019"))
  
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec), 
               aes(x = lon, y = lat), 
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr), 
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) + 
    geom_text(data = top.cities %>% filter(country.etc == st.abbr), 
              aes(x=long, y=lat, label=city),
              color="white", hjust=case_when(st.abbr=="NC"~.2,
                                               st.abbr=="VT"~.65,
                                               T~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) + 
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())

  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}

spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
  rename(spec=Var1, st.abbr=Var2) 

# Create a list of plots
plots <- purrr::map2_df(spec.state$spec, 
                        spec.state$st.abbr, 
                        ~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Ruddy Duck"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Belted Kingfisher"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Wild Turkey plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Wild Turkey"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sharp-shinned Hawk"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Downy Woodpecker"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Cedar Waxwing"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sandhill Crane"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sanderling Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sanderling"]$plot, 
          list(nrow=2, ncol=2)))

Explore Explanatory Rasters

The environmental rasters from Colorado, North Carolina, Oregon, and Vermont used in this study are meant to provide insights into which factors might potentially influence bird species distributions. The rasters were prepared in Part 1 of this study, and saved into 4 state-wide multi-layer rasters, each with 1000x1000 meter resoultions.

# Load prepared explanatory rasters by state
states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states

Land Cover Type Frequency

The table below shows the frequency distribution of various land cover types across the selected states. By evaluating this, the most common and rare habitats in each region can be ascertained. Moreover, understanding the prevalence of each land cover type can provide insights into potential hot-spots or habitats of interest for the bird species in question.

purrr::map_df(states, function(st) {
  freq.df <- terra::freq(r.list[[st]]$NLCD_Land) %>%
    mutate(state = st) %>%
    select(state, value, count)
}) 

Continuous Variable Distribution

Continuous rasters capture a gradient of values, often related to environmental gradients such as temperature, precipitation, or elevation. In this study, the distribution of such continuous variables for each state is illustrated using density plots. These plots provide a visual representation of the distribution patterns of each environmental variable, allowing for an assessment of the variations and patterns within each state. For instance, understanding the elevation range of Oregon compared to Colorado can offer insights into the altitudinal preferences of species. Additionally, spotting any anomalies or peaks in these distributions might suggest specific ecological or environmental significance, further aiding in the understanding and interpretation of model outputs.

# Plot densities of numeric rasters, by state
purrr::map(states, function(st) {
  r <- r.list[[st]]
  plots <- purrr::map(names(r)[names(r) != "NLCD_Land"], function(l) {
    ggplot(data=r[l] %>% as.data.frame()) + 
      geom_density(aes(x=!!sym(l))) + 
      theme_bw() +
      ggtitle(paste0(st, " Distribution for ", gsub(paste0("_", st), "", l)))
  })
  
  ggpubr::ggarrange(plotlist=plots, ncol=3, nrow=4) 
}) %>%
  ggpubr::ggarrange(plotlist=., ncol=1, nrow=4) + 
  ggtitle(paste0("Density Plots of Numeric Variables"))

Principal Component Analysis

In this section, the study employs Principal Component Analysis (PCA) to transform the high-dimensional data into a lower-dimensional form while retaining as much of the original variance as possible. The initial step involves merging data from different states into a single dataframe, followed by cleaning factor levels of the NLCD_Land column. Subsequently, the code converts factor columns into dummy variables and removes columns with only a single unique value. The PCA() function is used to perform the PCA, and the results are visualized using bar charts, displaying the importance of each variable for the first two dimensions. This visualization is important in understanding the significance of different environmental rasters and how they contribute to the variations captured by the principal components.

r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  df %>% setDT()
  df[, state := factor(s, levels=states)]
  df[apply(df, 1, function(.x) !any(is.na(.x)))]
}) 

# Custom function to process factor levels
clean.levels <- function(x) {
  # Remove non-alphanumeric characters and replace with underscores
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Remove any leading or trailing underscores
  x <- gsub("^_|_$", "", x)
  x <- gsub("__", "_", x)
  x <- gsub("NLCD_Land_", "", x)
  return(x)
}

r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]

# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1, 
                                      data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F]) 

names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))

# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
  as.data.frame() %>%
  filter(V1 == 1) %>%
  row.names()

if (length(uniq.1) >= 1) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}


pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")

res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Factor Analysis for Mixed Data

Diving deeper into the structure of the data, Factor Analysis for Mixed Data (FAMD) is implemented next. FAMD is a unique technique that extends traditional Factor Analysis by handling both continuous and categorical data, which fits the nature of the dataset used in this study. The code initiates this process by applying the FAMD() function to the dataframe. The results are then visualized in three distinct plots that highlight the variance explained by different variables for both continuous and categorical data. These plots provide insights into the underlying structure of the data and the significance of each variable. By observing the variable importance for the first two dimensions, displayed through bar charts, it can be discern which variables are related. Later, multi-collinearity and autocorrelation will be addressed in greater depth.

famd.fit <- FAMD(r.df, graph=F)

ggpubr::ggarrange(plotlist=purrr::map(
  c("var", "quanti", "quali"), 
  ~plot.FAMD(famd.fit, choix=.x)),
  ncol=1, nrow=3)

res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Pseudo-Absence Generation

In species distribution modeling, the presence of a species in certain locations is often well-recorded, but the absence is typically under-reported or not reported at all. This creates a challenge when trying to understand the complete distribution of a species. To address this, the concept of pseudo-absence data is introduced. Pseudo-absence data are artificially generated points that represent locations where the species is assumed not to be present. In this section, pseudo-absence data points are generated for the eight bird species across the four states ensuring that these points do not overlap with observed presence data. The generated pseudo-absence data helps in providing a more comprehensive view of the species distribution and serves as an essential component for the upcoming modeling process.

Buffering Raster Data

For the method used for generating pseudo-absence data in this analysis, initially a buffer is created around each observation point, each with a 5000 meter radius. This buffer essentially “blocks” the regions of the sample-area from where pseudo-absence points can be sampled. I.e., only the non-buffered zones can be sampled from.

# Set up output directory
output.dir <- "artifacts/masks_5k"
if (!dir.exists("artifacts")) dir.create("artifacts")
if (!dir.exists(output.dir)) dir.create(output.dir)

# Load Data

# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states

# Get observation data
obs.df <- list.files("data/final", full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# Buffering Raster Data

dist <- 5e3

mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
                        dist=10000, u="m") {
  obs.pt <- st_transform(obs.df[i, "observation.point"], st_crs(mask.raster))
  # Create a buffer around the point, ensuring correct units
  buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
  return(terra::rasterize(buf, mask.raster, update=T, field=1))
}

# For each observation point, you can now create a distance 
# raster and then mask cells within the buffer distance
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
                               dist=10000, u="m") {
  # Create an empty raster with the same extent and resolution as r
  mask.raster <- terra::rast(r)
  # Recursively update mask.raster with additional buffered regions
  for(i in 1:nrow(obs.df)) {
    mask.raster <- mask.update(i, mask.raster, obs.df, 
                               obs.field=obs.field, 
                               dist=dist, u=u)
    gc()
  }
  return(mask.raster)
}

# Get masks by state, species
masks <- purrr::map(states, function(state) {
    specs <- sort(unique(obs.df$common.name))
    spec.masks <- purrr::map(specs, function(spec, st=state) {
      fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
      if (file.exists(fname)) {
        cat("Reading", spec, st, "mask from", fname, "\n")
        r.mask <- rast(fname)
      } else {
        cat("Computing", spec, st, "mask, and saving to", fname, "\n")
        r.mask <- get.buffered.zones(r=r.list[[st]], 
                                     obs.df=filter(obs.df, state == st & 
                                                     common.name == spec),
                                     dist=dist)
        terra::writeRaster(r.mask, fname, overwrite=T)
      }
      gc()
      r.mask
    }, .progress=T)
    names(spec.masks) <- specs
    spec.masks
  })
names(masks) <- states

Sampling Pseudo-Absences and Comparing Totals

This section identifies areas outside the buffers as potential zones for pseudo-absences. To ensure an accurate representation, a random sampling mechanism is implemented. For each bird species in a state, an equivalent number of pseudo-absence points (or a pre-defined minimum threshold), based on observed data are generated. The result is a set of coordinates representing regions where the bird species have not been observed.

# Set seed
set.seed(19)

# Function to sample n points from the non-masked parts
sample.inverse.mask <- function(r.original, r.mask, n, 
                                species, state,
                                sample.crs=4326, min.n=500,
                                output.dir="artifacts/pseudo_absence_regions") {
  if (!dir.exists(output.dir)) dir.create(output.dir)
  output.path <- file.path(output.dir,
                           gsub(" |\\-", "_", 
                                paste0(
                                  paste(state, tolower(species), sep="_"), 
                                  ".tif")
                           ))
  if (!file.exists(output.path)) {
    # Get inverse mask;
    # Set NA cells to 0, keep 0 cells as 0, change other cells to 1
    r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
    # Set 0 to 1 and everything else to NA
    r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
    # Crop so that anything outside of the state is NA
    r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
    
    # Create a binary raster from r.original where valid values are 
    # set to 1 and NA values remain NA
    r.binary <- terra::lapp(r.original[[1]], 
                            fun = function(x) ifelse(!is.na(x), 1, NA))
    
    # Multiply the cropped raster by the binary raster to ensure 
    # outside values are set to NA
    r.final <- r.cropped * r.binary
    terra::writeRaster(r.final, output.path, overwrite=T)
  } else {
    r.final <- terra::rast(output.path)
  }
  
  # Convert the raster to SpatialPoints
  gdf <- terra::as.points(r.final) %>%
    st_as_sf() %>%
    st_transform(crs=sample.crs)
  if (nrow(gdf) > 0) {
    gdf <- gdf %>%
      filter(!is.na(layer)) %>%
      select(geometry)
  } else {
    return(gdf)
  }
  
  # Set to min.n size if n < min.n
  if (n < min.n) n <- min.n
  # Make sure there is sufficient available sample points
  if (n > nrow(gdf)) n <- nrow(gdf)
  
  # Randomly sample n points from the available (non-masked) space
  sample.idx <- sample(1:nrow(gdf), n)
  samples <- gdf[sample.idx,] %>%
    mutate(common.name = species, 
           state = state, 
           lon = NA, 
           lat = NA,
           observations=0)
  
  # Populate lon and lat values:
  coords <- st_coordinates(samples)
  samples$lon <- coords[, "X"]
  samples$lat <- coords[, "Y"]
  
  return(samples)
}

# Get totals by species and state
totals <- obs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(N=n(), .groups="keep")


if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
  dir.create(file.path("data", "final_pseudo_absence"))
}

if (!all(
  file.exists(
    paste0(file.path("data", "final_pseudo_absence", 
                     paste0("pa_", states, ".rds")))
  )
)) {
  # Create a list of pseudo absence points, by species and state,
  # where the sample number `n` >= 500 | `n` == the total observed
  # for each respective state and species
  pseudo.absence.pts <- list()
  for (st in states) {
    r.original <- r.list[[st]]
    r.masks <- masks[[st]]
    pseudo.absence.pts[[st]] <- list()
    for (spec in names(r.masks)) {
      r.mask <- r.masks[[spec]]
      n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
      cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
      pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
        r.original, r.mask, spec, st, n=n, sample.crs=4326)
      cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
    }
  }
  
  # Extract raster values for each point
  for (state in states) {
    out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
    if (!file.exists(out.file.all)) {
      r <- r.list[[state]]
      
      cat(sprintf("Extracting points to values for %s...\n", state))
      # Load observations shapefile
      geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
      rownames(geo.df) <- NULL
      
      geo.df.crs <- st_crs(geo.df)
      
      # Define target CRS and update
      target.crs <- st_crs(r)
      cat(sprintf("Updating CRS for %s...\n", state))
      geo.df <- st_transform(geo.df, target.crs)
      
      # Extract raster values
      for (r.name in names(r)) {
        cat("\tExtracting", r.name, "values for", state, "\n")
        x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
        geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
      }
      
      # Update crs back
      geo.df <- st_transform(geo.df, geo.df.crs)
      
      # Fix names; Filter NA values
      geo.df <- geo.df %>%
        filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
        suppressWarnings() 
      
      saveRDS(geo.df, out.file.all)
      cat("--------------\n")
    }
  }
}

The table below compares the generated pseudo-absence data with the observed dataset. This comparison ensures that the number of pseudo-absence points matches the observed ones, balancing the dataset and making it ready for the next analysis steps.

# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# There might be some slight differences since there are occasionally NA values
abs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(psuedo.absence.N=n(), .groups="keep") %>%
  left_join(totals, by=c("state", "common.name")) %>%
  rename(observed.N = N) %>%
  knitr::kable()
state common.name psuedo.absence.N observed.N
CO Belted Kingfisher 4523 4551
CO Cedar Waxwing 3431 3446
CO Downy Woodpecker 7440 7511
CO Ruddy Duck 1715 1726
CO Sanderling 497 131
CO Sandhill Crane 1524 1532
CO Sharp-shinned Hawk 2241 2257
CO Wild Turkey 2597 2611
NC Belted Kingfisher 4042 4183
NC Cedar Waxwing 4059 4195
NC Downy Woodpecker 10415 10914
NC Ruddy Duck 1076 1106
NC Sanderling 488 311
NC Sandhill Crane 489 118
NC Sharp-shinned Hawk 1211 1254
NC Wild Turkey 2261 2372
OR Belted Kingfisher 5803 5837
OR Cedar Waxwing 8405 8446
OR Downy Woodpecker 8529 8576
OR Ruddy Duck 1996 2010
OR Sanderling 496 258
OR Sandhill Crane 2443 2458
OR Sharp-shinned Hawk 2696 2714
OR Wild Turkey 2429 2440
VT Belted Kingfisher 1956 2033
VT Cedar Waxwing 1098 3938
VT Downy Woodpecker 1598 4635
VT Ruddy Duck 490 51
VT Sanderling 493 39
VT Sandhill Crane 492 76
VT Sharp-shinned Hawk 730 748
VT Wild Turkey 2090 2181

Identifying Spatial Autocorrelation

Moran’s I

This is a common measure of global spatial autocorrelation. A positive Moran’s I suggests clustering, a negative value suggests dispersion, and a value near zero suggests randomness. In other words, this is randomization approach to test for spatial autocorrelation. It is essentially checking if the data has a spatial pattern that is significantly different from what would be expected if the values were randomly distributed in space.

Understanding the Results

Moran I statistic: This is the calculated Moran’s I value for the data, which quantifies the degree of spatial autocorrelation. A positive value indicates positive autocorrelation (similar values are closer together), while a negative value indicates negative autocorrelation (dissimilar values are closer together). A value close to zero indicates a random spatial pattern.

Moran I statistic standard deviate: This value represents the standardized Moran’s I value. The larger the absolute value of this statistic, the stronger the evidence against the null hypothesis (of no spatial autocorrelation). A positive value indicates positive spatial autocorrelation, suggesting clustering of similar values.

P-value: This is the probability of observing a Moran’s I value as extreme as, or more extreme than, the one computed from the data, assuming the null hypothesis of no spatial autocorrelation is true. A very small p-value provides strong evidence to reject the null hypothesis, indicating significant spatial autocorrelation in your data.

Expectation: This is the expected Moran’s I value under the null hypothesis of no spatial autocorrelation. For a large dataset, it’s typically close to zero.

Variance: This is the variance of the Moran’s I statistic under the null hypothesis.

Spatial Autocorrelation of Observations

if (!dir.exists("artifacts/obs_autocor_morans")) {
  dir.create("artifacts/obs_autocor_morans")
}

# Get observation data
df <- list.files(file.path("data", "final"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observations, geometry) # We only interested in these data here

states <- sort(unique(df$state))
species <- sort(unique(df$common.name))

# Function to extract the desired results from output
extract.data <- function(st, spec, results) {
  data_frame(
    state = st,
    species = spec,
    statistic = results$statistic,
    p.value = results$p.value,
    moran.i.statistic = results$estimate['Moran I statistic'],
    expectation = results$estimate['Expectation'],
    variance = results$estimate['Variance']
  )
}

# Define the k for knn computation
k <- 5

if (!file.exists("artifacts/obs_autocor_morans/mi.results.rds")) {
  # Perform test by state, by species
  mi.results <- list()
  
  for (st in states) {
    mi.results[[st]] <- list()
    for (spec in species) {
      cat("Doing Moran's test for", spec, "in", st, "\n")
      # Filter data
      d <- df %>% filter(common.name == spec & state == st)
      mi.results[[st]][[spec]]$data <- d
      # Fit knn model
      knn <- d %>%
        knearneigh(k = k)
      mi.results[[st]][[spec]]$knn <- knn
      # Create a neighbor's list
      nb <- knn %>%
        knn2nb()
      mi.results[[st]][[spec]]$nb <- nb
      # Create a spatial weights matrix
      listw <- nb2listw(nb)
      mi.results[[st]][[spec]]$listw <- listw
      # Compute Moran's I
      results <- moran.test(d$observations, listw)
      mi.results[[st]][[spec]]$moran.test.results <- extract.data(st, spec, results)
    }
  }
  saveRDS(mi.results, "artifacts/obs_autocor_morans/mi.results.rds")
} else {
  mi.results <- readRDS("artifacts/obs_autocor_morans/mi.results.rds")
}

spec.stat <- expand.grid(species=species, state=states, stringsAsFactors=F)

mi.results.df <- purrr::map(1:nrow(spec.stat), function(i) {
    spec <- spec.stat[i, ]$species
    st <- spec.stat[i, ]$state
    mi.results[[st]][[spec]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

The Moran’s I statistic for the Belted Kingfisher and Cedar Waxwing are positive in Oregon, as well as the Sharp-shinned Hawk in both Vermont and Oregon. This suggests that locations with high observations of these species are close to other locations with high observations, and the same for low observations. In simple terms, the observation patterns of these species are clustered.

Below is an example of the actual observations of the Sharp-shinned Hawk in Vermont against the spatial lag (i.e., the weighted sum of the neighboring values at each point). Consider the following when interpreting the plot:

  • First Quadrant (top-right): High values surrounded by high values (indicating clustering of high values).
  • Second Quadrant (top-left): Low values surrounded by high values.
  • Third Quadrant (bottom-left): Low values surrounded by low values (indicating clustering of low values).
  • Fourth Quadrant (bottom-right): High values surrounded by low values.
# Get spatial weights list
vt.ssh <- mi.results[["VT"]][["Sharp-shinned Hawk"]]$data

# Calculate the spatial lag of the observations
vt.ssh$spatial.lag <- lag.listw(
  mi.results[["VT"]][["Sharp-shinned Hawk"]]$listw,
  vt.ssh$observations)

# Scatter plot for the Sharp-Shinned Hawk in Vermont
ggplot(vt.ssh, aes(x=observations, y=spatial.lag)) +
  geom_point() +
  geom_smooth(method="lm", col="red") + # Adds a linear regression line
  ggtitle("Moran Scatter Plot for Sharp-shinned Hawk in VT") +
  xlab("Observations (log10 Scale)") +
  ylab("Spatial Lag of Observations (log10 Scale)") +
  scale_y_log10() + scale_x_log10()

Spatial Autocorrelation of Environmental Factors

env.df.list <- list()
for (state in states) {
  r <- r.list[[state]]
  
  gdf <- terra::as.points(r) %>% 
    st_as_sf() %>%
    st_transform(crs=4326)
  # Fix names
  names(gdf) <- gsub(paste0("_", state, "$"), "", names(gdf))
  
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=gdf, sep=".") %>% 
    predict(., gdf) %>%
    as.data.frame()
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  gdf <- gdf %>% select(-NLCD_Land) %>% cbind(binary.lc)
  
  env.df.list[[state]] <- gdf
}

# Perform test by state, by environmental variable
env.mi.results <- list()

output.dir <- file.path("artifacts", "env_autocor_morans")
if (!dir.exists(output.dir)) dir.create(output.dir)

for (st in states) {
  env.mi.results[[st]] <- list()
  gdf <- env.df.list[[st]]
  env.vars <- names(gdf)[names(gdf) != "geometry"]
  for (e in env.vars) {
    output.path <- file.path(output.dir, paste0(st, "_", e, ".rds"))
    if (!file.exists(output.path)) {
      cat("Doing Moran's test for", e, "in", st, "\n")
      # Filter data
      d <- gdf %>% select(!!sym(e))
      # env.mi.results[[st]][[e]]$data <- d
      n.distinct <- d %>% pull(!!sym(e)) %>% unique() %>% length()
      if (n.distinct > 1) {
        # Fit knn model
        knn <- d %>%
          knearneigh(k = k)
        # env.mi.results[[st]][[e]]$knn <- knn
        # Create a neighbor's list
        nb <- knn %>%
          knn2nb()
        # env.mi.results[[st]][[e]]$nb <- nb
        # Create a spatial weights matrix
        listw <- nb2listw(nb)
        # env.mi.results[[st]][[e]]$listw <- listw
        # Compute Moran's I
        results <- moran.test(d[[e]], listw)
        env.mi.results[[st]][[e]]$moran.test.results <- extract.data(st, e, results)
      }
      cat("\tSaving results...\n")
      saveRDS(env.mi.results[[st]][[e]]$moran.test.results, output.path)
    } else {
      cat("Getting Moran's test results for", e, "in", st, "\n")
      env.mi.results[[st]][[e]]$moran.test.results <- readRDS(output.path)
    }
  }
}


env.stat <- expand.grid(env=names(env.df.list$CO )[names(env.df.list$CO) != "geometry"], 
                        state=states, stringsAsFactors=F)

env.mi.results.df <- purrr::map(1:nrow(env.stat), function(i) {
    e <- env.stat[i, ]$env
    st <- env.stat[i, ]$state
    env.mi.results[[st]][[e]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
env.mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

It is apparent that many of the environmental variables exhibit strong spatial clustering patterns. Notably, the weather layers have Moran’s I values nearing 1, signifying near-perfect spatial autocorrelation. Such patterns, while anticipated for certain types of variables, can introduce challenges in statistical analyses. Specifically, strong spatial autocorrelation often violates the assumption of observation independence fundamental to many traditional statistical models, potentially leading to biased parameter estimates and misleading significance levels. However, this spatial dependence can be accounted for using specialized spatial statistical methods or integrated as inherent structure in certain modeling techniques. It’s crucial to select an appropriate modeling method that accommodates or leverages this spatial structure, ensuring reliable and valid results.

Mitigating Potential Problems Due to Spatial Autocorrelation

  1. Multi-Collinearity
  • Problem: When predictors are spatially autocorrelated, they may also be correlated with each other. This multi-collinearity can make it hard to determine the influence of individual predictors on the response variable.
  • Solution: One method of detecting multi-collinearity is to use the condition index derived from the eigenvalues of a correlation matrix. Variables with high condition indices can be further examined through their contributing eigenvectors to identify key contributors. Furthermore, visualizing correlation matrices can help provide a clearer understanding. Based on these insights, dropping variables, combining variables, or applying dimension reduction techniques can help mitigate mutli-collinearity between features.
  1. Spatial Autocorrelation in Residuals
  • Problem: Spatially autocorrelated residuals indicate that the model hasn’t captured all spatial structures, leading to biased parameter estimates.
  • Solution: Employ spatial regression models that can incorporate spatial structure, such as spatial lag or spatial error models. Alternatively, add spatially structured random effects to more standard models.
  1. Overfitting
  • Problem: Models that capture too much of the spatial structure in the predictors might perform exceptionally well on training data but poorly on validation or future data.
  • Solution: Use spatial cross-validation to ensure models generalize well. Regularization techniques, such as Ridge or LASSO regression, can also help prevent overfitting.
  1. Non-Stationarity

Multi-collinearity is addressed in the following section, while the other mitigation solutions will be implemented either during the modeling stage of the study (e.g., cross-validation), or else addressed during the feature engineering/selection portion.

Checking for Multi-Collinearity in Environmental Factors

all.columns <- unique(unlist(lapply(env.df.list, names)))
env.df <- lapply(env.df.list, function(df) {
  missing.cols <- setdiff(all.columns, names(df))
  for (col in missing.cols) {
    df[[col]] <- NA
  }
  return(df)
}) %>%
  do.call("rbind", .) %>%
  as.data.frame() %>%
  select(-geometry, -unclassified)

corr.matrix <- cor(env.df, use="na.or.complete")
eigenvalues <- eigen(corr.matrix)$values
ci.df <- tibble(
  variable=names(env.df),
  condition.index=sqrt(max(eigenvalues) / eigenvalues)
)

ci.df
# Extract the eigenvectors
eigenvectors <- eigen(corr.matrix)$vectors

threshold <- 30
large.ci.indices <- which(sqrt(max(eigenvalues) / eigenvalues) > threshold)

# Examine the eigenvectors
for (index in large.ci.indices) {
  cat(paste("Eigenvalue:", eigenvalues[index]), "\n")
  cat(paste("Condition Index:", 
                  sqrt(max(eigenvalues) / eigenvalues[index])), "\n")
  
  # Sorting eigenvector components by absolute magnitude for clarity
  ev <- eigenvectors[, index]
  sorted.ev <- sort(abs(ev), decreasing = T)
  ev.top <- sorted.ev[1:5] %>%
    as_tibble() %>%
    mutate(variable=rownames(corr.matrix)[order(-abs(ev))][1:5]) %>%
    select(variable, value)
  
  cat("\nTop 5 contributors to multicollinearity at the condition idx:\n")
  
  for (row in 1:5) {
    cat("\t", ev.top[row,]$variable, ":", signif(ev.top[row,]$value, 3), "\n")
  }
  
  cat("\n------------------------\n")
}
## Eigenvalue: 0.00135666182336175 
## Condition Index: 65.3042479620253 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   avg_prcp : 0.814 
##   tmax : 0.459 
##   tmin : 0.356 
##   dem : 0.0108 
##   Winter_NDVI : 0.00657 
## 
## ------------------------
## Eigenvalue: 1.97064586234132e-15 
## Condition Index: 54184234.4382929 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   shrub_scrub : 0.508 
##   evergreen_forest : 0.5 
##   herbaceous : 0.473 
##   cultivated_crops : 0.318 
##   deciduous_forest : 0.207 
## 
## ------------------------
corrplot::corrplot(corr.matrix)

Feature Engineering

As described previously, feature engineering is an essential step in predictive modeling, as it can help to reduce possible problems due to multi-collinearity or spatial autocorrelation. In addition, it can help reduce overfitting, as well as provide new insights unavailable through basic pre-processing of the data.

Setup

First, create a space to output any “engineered” rasters:

output.dir <- "artifacts/feature_engineering"
if (!dir.exists(output.dir)) dir.create(output.dir)

Use Hierarchical Categories for Land Cover

Recall the hierarchical categories for the land cover data:

  • Water:
    • 11: Open Water
    • 12: Perennial Ice/Snow
  • Developed
    • 21: Developed, Open Space
    • 22: Developed, Low Intensity
    • 23: Developed, Medium Intensity
    • 24: Developed, High Intensity
  • Barren
    • 31: Barren Land (Rock/Sand/Clay)
  • Forest
    • 41: Deciduous Forest
    • 42: Evergreen Forest
    • 43: Mixed Forest
  • Shrubland
    • 51: Dwarf Shrub
    • 52: Shurb/Scrub
  • Herbaceous
    • 71: Grassland/Herbaceous
    • 72: Sedge/Herbaceous
    • 73: Lichens
    • 74: Moss
  • Planted/Cultivated
    • 81: Pasture/Hay
    • 82: Cultivated Crops
  • Wetlands
    • 90: Woody Wetlands
    • 95: Emergent Herbaceous Wetlands

Using these categories, feature reduction can be accomplished using a heuristic methodology. Other redundant rasters, such as the Waterbody and Urban Imperviousness rasters, can also be combined with the respective related land cover rasters to further reduce multicollinearity between rasters.

create.parent.category.rasters <- function(input.raster, 
                                           state, 
                                           output.dir,
                                           layer.name="NLCD_Land") {
  # Define the category mappings
  categories <- list(
    water = c("Open Water"),
    ice.snow = c("Perennial Ice/Snow"),
    developed = c("Developed, Open Space",
                  "Developed, Low Intensity",
                  "Developed, Medium Intensity",  
                  "Developed, High Intensity"),
    barren = c("Barren Land"),
    forest = c("Mixed Forest", 
               "Deciduous Forest", 
               "Evergreen Forest"),
    shrubland = c("Shrub/Scrub", "Dwarf Shrub"),
    herbaceous = c("Herbaceous", "Grassland/Herbaceous",
                   "Sedge/Herbaceous", "Lichens", "Moss"),
    planted.cultivated = c("Cultivated Crops", "Hay/Pasture"),
    wetlands = c("Woody Wetlands", "Emergent Herbaceous Wetlands")
  )
  
  # Iterate through each category to create and save the binary raster
  for (cat.name in names(categories)) {
    # Define the output file path
    output.file <- file.path(output.dir, paste0(state, "_", cat.name, ".tif"))
    
    if (!file.exists(output.file)) {
      cat("Generating raster for", cat.name, "in", state, "\n")
      if (cat.name != "developed") {
        # Create a binary raster for the current category
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x %in% categories[[cat.name]] ~ 1.,
                                      T ~ 0)
                                  })
        names(out.raster) <- cat.name
        if (cat.name == "water") {
          # Combine with waterbody raster layer
          wb.raster <- input.raster[[paste0("waterbody_", state)]]
          out.raster <- (out.raster + wb.raster) / 2
          names(out.raster) <- "water"
        }
      } else {
        # Set developed raster to be a value, scale ranging from 0.25-1 by 0.25
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x == "Developed, Open Space" ~ 0.25,
                                      x == "Developed, Low Intensity" ~ 0.5,
                                      x == "Developed, Medium Intensity" ~ 0.75,
                                      x == "Developed, High Intensity" ~ 1.,
                                      T ~ 0)
                                  })
        # Combine with urban imperviousness
        ui.raster <- input.raster[[paste0("urban_imperviousness_", state)]]
        ui.min <- terra::minmax(ui.raster)[[1]]
        ui.max <- terra::minmax(ui.raster)[[2]]
        ui.raster.scaled <- (ui.raster - ui.min) / (ui.max - ui.min)
        out.raster <- (out.raster + ui.raster.scaled) / 2
        names(out.raster) <- "developed"
      }
      
      # Save the output raster to the specified output path
      writeRaster(out.raster, output.file, overwrite=T)
    }
  }
}

for (state in states) {
  create.parent.category.rasters(r.list[[state]], state, output.dir)
}

Aggregate Seasonal NDVI Rasters (Min/Max)

NDVI, or Normalized Difference Vegetation Index, is a common measure of vegetation health. For each state, there are four separate NDVI rasters representing different seasons. Aggregating these rasters provides two essential insights: the minimum and maximum NDVI values. The minimum NDVI may correspond to the period when vegetation is least vigorous (e.g., winter or drought), while the maximum may indicate the period of peak vegetation growth (e.g., spring or summer). By scaling these values between 0 and 1, and then storing them separately as minimum and maximum NDVI rasters, a standardized measure of vegetation dynamics throughout the year for each state is achieved.

# Convert 4 separate NDVI rasters to a single raster
for (state in states) {
  output.file.min <- file.path(output.dir, paste0(state, "_min_NDVI.tif"))
  output.file.max <- file.path(output.dir, paste0(state, "_max_NDVI.tif"))
  if (!file.exists(output.file.min) | !file.exists(output.file.max)) {
    r.ndvi <- r.list[[state]][[names(r.list[[state]]) %like% "NDVI"]]
    # Parse seasons
    for (s in names(r.ndvi)) {
      r <- r.ndvi[[s]]
      .min <- terra::minmax(r)[[1]]
      .max <- terra::minmax(r)[[2]]
      # Scale to be from 0 to 1
      r.ndvi[[s]] <- (r - .min) / (.max - .min)
    }
    # Get min/max scaled NDVI values
    r.max <- max(r.ndvi)
    names(r.max) <- "max.ndvi"
    r.min <- min(r.ndvi)
    names(r.min) <- "min.ndvi"
    writeRaster(r.max, output.file.max, overwrite=T)
    writeRaster(r.min, output.file.min, overwrite=T)
  }
}

Combine Temperature Rasters (Difference)

In this section, the difference between the maximum and minimum temperatures for each state is calculated. This temperature difference, often referred to as diurnal temperature range, can provide insights into the variability of daily temperatures. A high difference may suggest drastic temperature changes from day to night, which can impact ecosystems and human health. By creating a scaled raster of these differences (ranging from 0 to 1), the temperature variability can be compared across different states.

# Get the difference between the min and max temperatures as a raster
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_tdiff.tif"))
  if (!file.exists(output.file)) {
    # Get difference
    r.tdiff <- r.list[[state]][[paste0("tmax_", state)]] - 
      r.list[[state]][[paste0("tmin_", state)]]
    # Get min/max differences
    .min <- terra::minmax(r.tdiff)[[1]]
    .max <- terra::minmax(r.tdiff)[[2]]
    # Scale to be from 0 to 1
    r.tdiff <- (r.tdiff - .min) / (.max - .min)
    names(r.tdiff) <- "temp.diff"
    # Write raster
    writeRaster(r.tdiff, output.file, overwrite=T)
  }
}

Spatial Filters

These are components derived from the spatial coordinates, which can capture and account for spatial structures in the data. E.g., a polynomial term based on latitude and longitude could account for some of the spatial trend.

for (state in states) {
  output.file.lon <- file.path(output.dir, paste0(state, "_lon.tif"))
  output.file.lat <- file.path(output.dir, paste0(state, "_lat.tif"))
  output.file.lon2 <- file.path(output.dir, paste0(state, "_lon_poly.tif"))
  output.file.lat2 <- file.path(output.dir, paste0(state, "_lat_poly.tif"))
  output.file.lli <- file.path(output.dir, paste0(state, "_lon_lat_interaction.tif"))
  if (!all(file.exists(c(output.file.lon, output.file.lat, 
                         output.file.lon2, output.file.lat2,
                         output.file.lli)))) {
    # Get raster as df
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y")
    
    # Initialize empty raster templates
    r.lat <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    r.lon <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with lat/lon values
    r.lat[r.df$cell] <- r.df$lat
    names(r.lat) <- "lat"
    r.lon[r.df$cell] <- r.df$lon
    names(r.lon) <- "lon"
    # Get polynomial and interaction terms
    r.lat2 <- r.lat^2 
    names(r.lat2) <- "lat.sqrd"
    r.lon2 <- r.lon^2 
    names(r.lon2) <- "lon.sqrd"
    r.lon.lat <- r.lon * r.lat
    names(r.lon.lat) <- "lon.lat.interaction"
    
    # Write outputs
    writeRaster(r.lat, output.file.lat, overwrite=T)
    writeRaster(r.lon, output.file.lon, overwrite=T)
    writeRaster(r.lat2, output.file.lat2, overwrite=T)
    writeRaster(r.lon2, output.file.lon2, overwrite=T)
    writeRaster(r.lon.lat, output.file.lli, overwrite=T)
  }
}

De-trend DEM

Digital Elevation Models (DEM) give us a detailed overview of the terrain elevation. However, sometimes, larger spatial trends (e.g., the general increase in elevation from the coast to the mountains) can overshadow more localized features or variations. By de-trending the DEM, the underlying larger spatial trend is removed, highlighting only the local elevation differences. In the given section, a linear model is fitted to the DEM using latitude and longitude (and their polynomial terms) to capture the broader spatial trends. The residuals from this model represent the local variations in elevation, devoid of the overarching trend. These de-trended values are then scaled between 0 and 1 to create a standardized representation of the localized elevation changes for each state.

for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_detrend_dem.tif"))
  if (!file.exists(output.file)) {
    # Get raster as df
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell, !!sym(paste0("dem_", state))) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y", dem=paste0("dem_", state)) %>%
      # Convert to data.table
      setDT()
    
    # Fit model based on location, with dem as response
    fit <- lm(dem ~ lat * lon + I(lat^2) + I(lon^2), 
              data=r.df[!is.na(dem)])
    # Get residuals from the model as "de-trended" dem values
    r.df[!is.na(dem), dem.detrended := residuals(fit)]
    # Scale de-trended values
    r.df[, dem.detrended := (dem.detrended - min(dem.detrended, na.rm=T)) / 
           (max(dem.detrended, na.rm=T) - min(dem.detrended, na.rm=T))]
    # Initialize empty raster template
    r.dem <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with de-trended dem values
    r.dem[r.df$cell] <- r.df$dem.detrended
    names(r.dem) <- "dem.detrended"
    
    # Write outputs
    writeRaster(r.dem, output.file, overwrite=T)
  }
}

Feature Selection

Feature selection is the process of determining the most relevant input variables to use in modeling or analysis. By selecting the most informative features, one can improve the model’s accuracy, reduce overfitting, reduce issues due to multi-collinearity and spatial autocorrelation, and decrease the computational cost of training.

Load all Original Features and Feature Engineering Outputs

In this section, the final dataset that will be used for feature selection is prepared. This involves:

  • Loading the pseudo-absence and observation datasets.
  • Fetching the feature-engineered raster data (geospatial data layers) and associating these rasters with the observational data.
  • Loading the original raster layers, as these should still be included during the feature selection process.
  • Converting the original land cover feature into binary variables.
  • Saving the final aggregated dataset.
final.output.dir <- "artifacts/final_data"
if (!dir.exists(final.output.dir)) dir.create(final.output.dir)
final.fpath <- file.path(final.output.dir, "final_data.rds")

if (!file.exists(final.fpath)) {
  # Combine observation/pseudo-absence data
  df <- c(list.files(file.path("data", "final_pseudo_absence"), full.names = T),
          list.files(file.path("data", "final"), full.names = T)) %>%
    purrr::map_df(readRDS)
  
  # Get feature engineered raster data
  output.dir <- "artifacts/feature_engineered_final"
  if (!dir.exists(output.dir)) dir.create(output.dir)
  get.fe <- all(case_when(is_empty(list.files(output.dir)) ~ T,
                          list.files(output.dir) < length(states) ~ T,
                          T ~ F))
  fe.r.list <- list()
  if (get.fe) {
    for (state in states) {
      fe.r.list[[state]] <- list.files(
        "artifacts/feature_engineering", full.names = T)[
          grepl(
            paste0("^", state, "_"), 
            list.files("artifacts/feature_engineering", full.names = F)
          )
        ] %>% rast()
      # Save as multi-layer rasters by state
      writeRaster(fe.r.list[[state]], 
                  file.path(output.dir, paste0(state, ".tif")),
                  overwrite=T)
    }
  } else {
    for (state in states) {
      fe.r.list[[state]] <- rast(file.path(output.dir, paste0(state, ".tif")))
    }
  }
  
  # Add empty fields in dataframe for each new raster 
  purrr::map(fe.r.list, names) %>% 
    reduce(c) %>% 
    unique() %>% 
    sort() %>%
    purrr::walk(function(n) {
      if (!(n %in% names(df))) df[[n]] <<- 0
    })
  
  # Update point crs to match fe rasters
  df <- st_transform(df, st_crs(fe.r.list[[1]])) 
  df.list <- list()
  # From state multi-layer rasters, extract values from each point in df
  for (st in states) {
    cat(sprintf("Extracting points to values for %s...\n", st))
    r <- fe.r.list[[st]]
    df.list[[st]] <- df %>% filter(state == st)
    # Extract raster values
    for (r.name in names(r)) {
      cat("\tExtracting", r.name, "values for", st, "\n")
      x <- terra::extract(r[[r.name]], df.list[[st]])[[r.name]]
      df.list[[st]] <- mutate(df.list[[st]], !!r.name := x)
    }
  }
  # Combine list
  df <- do.call("rbind", df.list)
  rm(df.list)
  gc()
  # Update crs back
  df <- st_transform(df, 4326)
  # Remove rownames
  rownames(df) <- NULL
  
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=df, sep="") %>% 
    predict(., df) %>%
    as.data.frame()
  
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  # Remove duplicates from feature engineering
  binary.lc <- binary.lc %>%
    select(-c("unclassified", "perennial_snow_ice", "barren_land",
              "shrub_scrub", "herbaceous"))
  # Combine with dataframe, remove land cover categorical var
  df <- df %>% select(-NLCD_Land) %>% cbind(binary.lc)
  saveRDS(df, final.fpath)
} else {
  df <- readRDS(final.fpath)
}

# Convert to data.table
df %>% setDT()

# View output
df %>% as_tibble()

Correlation

In this section, correlation between the predictors and the dependent variable (observations) for each species are examined. The correlation measure assists in identifying which predictors have a linear relationship with the response variable and can, therefore, impact model predictions.

obs.cor <- purrr::map(sort(unique(df$common.name)), function(spec) {
  corr.matrix <- cor(select(df[common.name == spec], 
                            -c("state", "common.name", "geometry")))
  obs.cor <- corr.matrix[which(rownames(corr.matrix) == "observations"),] %>%
    as.data.frame()
  obs.cor$variable <- rownames(obs.cor)
  obs.cor %>%
    filter(variable != "observations") %>%
    rename(!!spec := `.`) %>%
    arrange(-abs(!!sym(spec))) %>%
    mutate(!!spec := paste0(variable, ": ", signif(!!sym(spec), 2))) %>%
    select(!!sym(spec)) 
}) %>% 
  do.call("cbind", .)

obs.cor %>% as_tibble()

Train/Test Split

Ensuring the quality and robustness of a model requires that it is evaluated on unseen data. Similarly, it is important to avoid “data leakage”, which occurs when information from the test set inadvertently influences the training process. Reasons for data leakage can range from pre-processing steps being applied to the entire dataset instead of separately to train and test sets, to more subtle causes like target leakage where future information inadvertently gets used in the past due to data preparation mistakes.

In this section, the data is split into a training set and a test set. The training set is used for training the model, while the test set is reserved to evaluate the model’s performance. The data is stratified based on latitude, longitude, species, state, and absence to ensure that the distribution of these variables is consistent between the train and test sets. This stratification helps maintain representative samples and mitigates potential biases.

# Set seed for splitting and modeling
set.seed(19)

stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Cut along lat/lon values to create grids (lat.bin & lon.bin)
  # lat.lon.bins is the number of divisions you want
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
  df$absence <- df$observations == 0
  
  # Create a new variable combining the stratification variables
  df %>%
    # stratify on lat/lon bins, species, state, and absence
    mutate(strata = paste(lat.bin, lon.bin, common.name, state, absence)) %>%
    pull(strata) %>%
    # Create the data partitions
    createDataPartition(., p = p, list = F) %>%
    suppressWarnings()
}

prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  df.train <- df[train.index, ]
  df.test <- df[-train.index, ]
  
  list(train = df.train, 
       test = df.test,
       index = train.index)
}

# Get train/test indices
train.test <- prepare.data(df, .7)

# Split; Remove non-predictive variables
df.train <- df[train.test$index,] 
df.train[, `:=` (geometry=NULL)]
df.test <- df[-train.test$index,]
df.test[, `:=` (geometry=NULL)]

train.x <- df.train %>% select(-observations)
train.y <- df.train$observations

test.x <- df.test %>% select(-observations)
test.y <- df.test$observations

For computational efficiency, models are cached. This section provides a function to retrieve the model from cache if it exists or save it to the cache if it doesn’t. This approach speeds up the modeling process, especially when iterating and fine-tuning various models, by avoiding retraining on the same dataset.

# Get model from cache if it has been saved before
get.model <- function(model, file.name, model.path) {
  f.path <- file.path(model.path, file.name)
  if (!dir.exists(model.path)) {
    dir.create(model.path)
  }
  # Model cache check
  if (ifelse(file.exists(f.path), T, F)) {
    model <- readRDS(f.path)
  } else {
    saveRDS(model, f.path)
  }
  model
}

LASSO Models - Importance from Coefficients

LASSO (Least Absolute Shrinkage and Selection Operator) regression is a type of linear regression that uses shrinkage. Here, data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e., models with fewer parameters).

In this section, a LASSO regression model is fitted for each species in each state. The regularization strength is controlled by the lambda parameter, where a value of zero results in regular linear regression and increasing values result in more regularization. By fitting LASSO models and examining the coefficients, we can determine the importance of each variable and variable interaction. Variables/interactions with non-zero coefficients are considered important, while those with coefficients shrunk to zero are considered non-informative.

species <- sort(unique(df.train$common.name))
if (!dir.exists("artifacts/models")) dir.create("artifacts/models")

# Define the control for the train function
ctrl <- trainControl(method = "cv", number = 5)

lasso.list <- purrr::map(species, function(spec) {
  spec.state.fit <- purrr::map(states, function(st) {
    cat("Fitting LASSO model for", spec, "in", st, "\n")
    spec.df <- df.train[common.name == spec & state == st][
      , `:=` (state=NULL, common.name = NULL)]
    
    # Remove any columns where all values are the same
    .remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
    .remove <- .remove[.remove != "observations"]
    if (!is_empty(.remove)) {
      spec.df <- spec.df %>% select(-.remove)
    }
    
    # Scale data
    # Identify fields to center/scale
    to.scale <- sapply(spec.df, function(x) is.numeric(x) && 
                         length(unique(x)) > 2)
    to.scale$observations <- F
    to.scale <- names(spec.df) %in% names(which(unlist(to.scale)))
    
    # Define the pre-processing steps (use the training data to avoid data leakage)
    # Apply centering and scaling to the non-binary fields and non-target
    preproc <- preProcess(spec.df[, ..to.scale], 
                          method = c("center", "scale"))
    
    # Center/Scale the training data
    df.train.scaled <- bind_cols(spec.df[, !(..to.scale)],
                                 predict(preproc, spec.df[, ..to.scale]))
    
    # LASSO (L1); Elastic Net, where alpha = 1
    fit.l1 <- get.model(
      train(observations ~ (.)^2, 
            data = df.train.scaled, 
            method = "glmnet",
            trControl = ctrl,
            tuneGrid = expand.grid(alpha = 1, 
                                   lambda = seq(0, 1, by = 0.1)),
            metric = "RMSE"),
      file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_regr_l1.rds"),
      model.path="artifacts/models/lasso_fs")
    
    gc()
    fit.l1
  })
  names(spec.state.fit) <- states
  spec.state.fit
})
names(lasso.list) <- species
# Get variable importance for LASSO models, based on coefficients
spec.state <- expand.grid(common.name=species, state=states, stringsAsFactors=F)
lasso.imp <- purrr::map_df(1:nrow(spec.state), function(i) {
  spec <- spec.state[i,]$common.name
  st <- spec.state[i,]$state
  fit <- lasso.list[[spec]][[st]]
  coef.df <- coef(fit$finalModel, s = fit$bestTune$lambda) %>%
    as.matrix() %>%
    as.data.frame()
  # Remove the intercept
  coef.df <- coef.df[-1, , drop = F]
  
  # Create a data frame of variable importance
  var.importance <- tibble(
    common.name = spec,
    state = st,
    variable = rownames(coef.df),
    importance = abs(coef.df[,1])
  ) %>%
    # Rank variables by importance
    arrange(state, common.name, -importance, variable) %>%
    # Only look at variables where imp. > 0
    filter(importance > 0)
})

lasso.imp

Decision Tree Models - Variable Importance

Decision trees are a class of machine learning models that recursively split the dataset into subsets based on the value of input variables. The goal is to make the data within each subset as homogeneous as possible regarding the target variable. The importance of a variable in a decision tree model can be assessed by the frequency and depth at which it is used to split the data. Variables used frequently and closer to the root of the tree are typically considered more important.

The primary advantage of using a tree-based model as opposed to a model like LASSO is that the relationships do not need to be linear, and interactions are naturally accounted for due to the hierarchical nature of the model itself.

In the code below, Decision Tree models are trained for each species in each state. The variable importance is extracted from the tree structure itself, which offers an intuitive understanding of the relationships and hierarchies within the data.

# Decision Tree Variable Importance
tree.list <- purrr::map(species, function(spec) {
  spec.state.fit <- purrr::map(states, function(st) {
    cat("Fitting Decision Tree model for", spec, "in", st, "\n")
    spec.df <- df.train[common.name == spec & state == st][
      , `:=` (state=NULL, common.name = NULL)]
    
    # Remove any columns where all values are the same
    .remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
    .remove <- .remove[.remove != "observations"]
    if (!is_empty(.remove)) {
      spec.df <- spec.df %>% select(-.remove)
    }

    # Fit the Decision Tree model
    fit.tree <- get.model(
      rpart::rpart(observations ~ ., data=spec.df, method="anova",
                   control=rpart::rpart.control(cp=0.001)),
      file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_tree.rds"),
      model.path="artifacts/models/trees_fs")
    
    gc()
    fit.tree
  })
  names(spec.state.fit) <- states
  spec.state.fit
})
names(tree.list) <- species
# Get variable importance for decision tree models
tree.imp <- purrr::map_df(1:nrow(spec.state), function(i) {
  spec <- spec.state[i,]$common.name
  st <- spec.state[i,]$state
  fit <- tree.list[[spec]][[st]]
  vi <- fit$variable.importance
  # Create a data frame of variable importance
  var.importance <- tibble(
    common.name = spec,
    state = st,
    variable = names(vi),
    importance = (vi - min(vi)) / (max(vi) - min(vi))
  ) %>%
    # Rank variables by importance
    arrange(state, common.name, -importance, variable)
})

tree.imp

Conclusion

Predictive modeling is iterative by nature, and thus conclusively selecting features to be included in the modeling process is impractical. The purpose of this exploratory analysis is just that - exploratory. The insights and results of this part of the study will be utilized in the iterative modeling process, and adjusted as needed.